This experiment aims to characterize the effects of Her3 knockout in zebrafish.
To do this, we employ a Her3 mutant created by crispr modification of the Her3 sequence that leads to a truncated 38 amino acid protein.
We also are using a morpholino targeted to the Her3 mRNA to prevent protein translation.
We are testing these effects at both 24 hours and 72 hours post fertilization.
There are six groups with samples collected at either 24hpf or 72hpf. RNA from the entire embryo was collected for all samples.
ERCC spike-in: 24hpf WIK samples had ERCC 1, and all the others had ERCC 2 spike-in controls added to facilitate evaluation of dynamic range and sensitivity of differential expression analysis.
RNAseq data were generated on an Illumina NovaSeq to create 151bp paired-end reads.
To analyze these samples, we aligned with HiSat2 and counted the number of reads mapping to each gene using featureCounts. We used EdgeR to test for differential expression between the groups. For motif analysis we used Homer. For most statistical analyses and plotting, we used R and and ggplot2. To do pathway analysis we used the Inginuity Pathway Analysis software.
library(tidyverse)
theme_set(theme_bw())
theme_update(plot.title = element_text(hjust = 0.5))
options(bitmaptype = "cairo")
library(gt)
library(edgeR)
for directoryName in \
misc \
slurmOut \
sbatchCmds \
output/fastqc \
output/figures \
output/figures/ERCC \
output/ERCC/aligned \
output/aligned/counts \
output/geneCounts \
output/counts \
output/motif/homerOut
do
if [ ! -d ${directoryName} ]
then
mkdir -p ${directoryName}
fi
done
#!/bin/sh
#SBATCH --account=gdtheisenlab
#SBATCH --error=slurmOut/fastqc-%j.txt
#SBATCH --output=slurmOut/fastqc-%j.txt
#SBATCH --mem=2G
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=5
#SBATCH --job-name fastqc
#SBATCH --wait
#SBATCH --array=0-71
#SBATCH --time=1-00:00:00
#SBATCH --mail-user=matthew.cannon@nationwidechildrens.org
#SBATCH --mail-type=FAIL,REQUEUE,TIME_LIMIT_80
set -e ### stops bash script if line ends with error
echo ${HOSTNAME} ${SLURM_ARRAY_TASK_ID}
module purge
module load FastQC/0.11.9-Java-11.0.2
inputPath=/home/gdkendalllab/lab/raw_data/fastq/2021_10_07
fileArray=(${inputPath}/[hmW]*/*fastq.gz)
fastqc \
-o output/fastqc \
-t 5 \
--extract \
${fileArray[${SLURM_ARRAY_TASK_ID}]}
#!/usr/bin/perl
use warnings;
use strict;
use Getopt::Long;
use Pod::Usage;
##############################
# By Matt Cannon
# Date: 6-22-21
# Last modified: 6-22-21
# Title: splitFastqcData.pl
# Purpose: Split fastqc_data.txt into sub-files based on modules
##############################
##############################
# Options
##############################
my $verbose;
my $help;
my $input;
my $outputDir = "";
# i = integer, s = string
GetOptions ("verbose" => \$verbose,
"help" => \$help,
"input=s" => \$input,
"outputDir=s" => \$outputDir
) or pod2usage(0) && exit;
pod2usage(1) && exit if ($help);
##############################
# Global variables
##############################
my $outputFH;
my $baseColumn;
##############################
# Code
##############################
if ($outputDir eq "") {
$input =~ /^(.+\/)/;
$outputDir = $1;
mkdir $outputDir . "/split_data"
}
##############################
### Stuff
### More stuff
open my $inputFH, "$input" or die "Could not open first input\n$!";
while (my $input = <$inputFH>){
chomp $input;
if ($input =~ /^>>END_MODULE/) {
# Close file
close $outputFH;
undef $baseColumn;
} elsif ($input =~ /^>>/) {
# Open new file
$input =~ s/^>>//;
$input =~ s/ /_/g;
$input =~ s/\t.+//;
open $outputFH, ">", $outputDir . "/split_data/" . $input . ".txt";
} elsif ($input !~ /^##/) {
# Deal with data
## Find column that has base numbering
if ($input =~ /Base|Position/ && $input =~ /^#/) {
# Change the word Position to Base to make downstream plotting easier
$input =~ s/Position/Base/;
my @dataArray = split "\t", $input;
for (my $i = 0; $i < scalar(@dataArray); $i++) {
if ($dataArray[$i] =~ /^#*Base$/) {
$baseColumn = $i;
}
}
}
# write to file
if (defined($baseColumn)) {
my @dataArray = split "\t", $input;
if ($dataArray[$baseColumn] =~ /-/) {
my ($lowerNum, $upperNum) = split "-", $dataArray[$baseColumn];
for (my $i = $lowerNum; $i <= $upperNum; $i++) {
my @tempArray = @dataArray;
$tempArray[$baseColumn] = $i;
print $outputFH join("\t", @tempArray), "\n";
}
} else {
print $outputFH $input, "\n";
}
} else {
print $outputFH $input, "\n";
}
}
}
##############################
# POD
##############################
#=pod
=head SYNOPSIS
Summary:
xxxxxx.pl - generates a consensus for a specified gene in a specified taxa
Usage:
perl xxxxxx.pl [options]
=head OPTIONS
Options:
--verbose
--help
=cut
sbatch sbatchCmds/fastqc.sh
rm output/fastqc/*.zip
for inFile in output/fastqc/*/fastqc_data.txt
do
perl ~/scripts/splitFastqcData.pl -i ${inFile}
done
adapter_file_dirs <- list.dirs(
path = "output/fastqc",
full.names = TRUE,
recursive = FALSE
)
to_plot <- tibble(
file = c(
"Adapter_Content.txt",
"Per_base_sequence_quality.txt"
),
column = c(
"Nextera_Transposase_Sequence",
"Mean"
)
)
for (i in 1:nrow(to_plot)) {
temp_data <- NULL
for (data_dir in adapter_file_dirs) {
temp_data <- read_delim(paste(data_dir,
"/split_data/",
to_plot$file[i],
sep = ""),
delim = "\t",
col_names = TRUE,
show_col_types = FALSE) %>%
rename_all(~ str_replace_all(., " ", "_")) %>%
rename_all(~ str_remove(., "[#']")) %>%
select(Base, to_plot$column[i]) %>%
mutate(Sample = str_remove(data_dir, "_001_fastqc") %>%
str_remove(".+\\/")) %>%
mutate(Read = str_remove(Sample, ".+_")) %>%
mutate(Tech = str_remove(Sample, "_.+")) %>%
bind_rows(temp_data)
}
max_y <- ceiling(max(temp_data[[to_plot$column[i]]]) * 1.1)
plot_fastqc <- ggplot(temp_data,
aes(x = Base,
y = get(to_plot$column[i]),
group = Sample,
color = Sample)) +
geom_line() +
geom_point() +
facet_wrap(~Tech, ncol = 1) +
ylim(0, max_y) +
ylab(to_plot$column[i]) +
theme(legend.position = "none") +
ggtitle(str_remove(to_plot$file[i], ".txt"))
png(
filename = paste("output/figures/fastqc_",
str_remove(to_plot$file[i], ".txt"),
".png",
sep = ""),
width = 7000,
height = 3000,
res = 300)
print(plot_fastqc)
dev.off()
}
input_reads <- read_tsv(list.files(path = "output/fastqc",
recursive = TRUE,
pattern = "Basic_Statistics.txt",
full.names = TRUE),
id = "File",
show_col_types = FALSE) %>%
filter(`#Measure` == "Total Sequences") %>%
select(-`#Measure`) %>%
mutate(
File = str_remove(File, "output/fastqc/") %>%
str_remove("_001_fastqc/split_data/Basic_Statistics.txt"),
Sample = str_remove(File, "_S[0-9].+"),
Lane = str_remove(File, ".+_L00") %>%
str_remove("_R[12]"),
Read = str_replace(File, ".+_R", "R"),
Value = as.numeric(Value)) %>%
rename(Reads = Value)
ggplot(input_reads, aes(x = Sample, y = Reads / 1000000, fill = Lane)) +
geom_col() +
facet_wrap(~Read, ncol = 1) +
ylab("Reads (in millions)") +
xlab("") +
ggtitle("Sequenced Reads Per Sample") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("output/figures/InputReads.png",
height = 3000,
width = 4000,
units = "px"
)
#!/bin/sh
#SBATCH --account=gdkendalllab
#SBATCH --array=0-17
#SBATCH --error=slurmOut/align-%j.txt
#SBATCH --output=slurmOut/align-%j.txt
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=10
#SBATCH --job-name align
#SBATCH --wait
#SBATCH --mail-user=matthew.cannon@nationwidechildrens.org
#SBATCH --mail-type=FAIL,REQUEUE,TIME_LIMIT_80
set -e ### stops bash script if line ends with error
echo ${HOSTNAME} ${SLURM_ARRAY_TASK_ID}
module purge
module load GCC/9.3.0 \
SAMtools/1.10 \
HISAT2/2.2.1
nameArray=(
her3_38aa_1_24hpf \
her3_38aa_1_72hpf \
her3_38aa_2_24hpf \
her3_38aa_2_72hpf \
her3_38aa_3_24hpf \
her3_38aa_3_72hpf \
her3-MO_1_24hpf \
her3-MO_2_24hpf \
her3-MO_3_24hpf \
mm-MO_1_24hpf \
mm-MO_2_24hpf \
mm-MO_3_24hpf \
WIK_1_24hpf \
WIK_1_72hpf \
WIK_2_24hpf \
WIK_2_72hpf \
WIK_3_24hpf \
WIK_3_72hpf
)
baseName=${nameArray[${SLURM_ARRAY_TASK_ID}]}
inputPath=/home/gdkendalllab/lab/raw_data/fastq/2021_10_07
# Make a variable that had comma-delimited list of input files
R1=$(ls ${inputPath}/${baseName}/*R1*fastq.gz | \
perl -pe 's/\n/,/' | \
perl -pe 's/,$//')
R2=$(ls ${inputPath}/${baseName}/*R2*fastq.gz | \
perl -pe 's/\n/,/' | \
perl -pe 's/,$//')
hisat2 \
-x /home/gdkendalllab/lab/references/hisat2/danRer11_plusERCC \
-1 ${R1} \
-2 ${R2} \
-k 1 \
-p 8 \
--summary-file output/aligned/${baseName}Summary.txt | \
samtools fixmate -@ 4 -m - - | \
samtools sort -@ 4 -O BAM | \
samtools markdup -@ 4 -r - - | \
samtools view -@ 4 -f 2 -b - \
> output/aligned/${baseName}.bam
samtools index output/aligned/${baseName}.bam
sbatch sbatchCmds/align.sh
#!/bin/sh
#SBATCH --account=gdkendalllab
#SBATCH --array=0-17
#SBATCH --error=slurmOut/countAligned.txt
#SBATCH --output=slurmOut/countAligned.txt
#SBATCH --mem=10G
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=2
#SBATCH --job-name countAligned
#SBATCH --wait
#SBATCH --time=1-00:00:00
module load GCC/9.3.0 \
SAMtools/1.10
fileArray=(output/aligned/*.bam)
for i in "${!fileArray[@]}"
do
fileArray[$i]="${fileArray[$i]##output/aligned/}"
fileArray[$i]="${fileArray[$i]%.bam}"
done
samtools view output/aligned/${fileArray[${SLURM_ARRAY_TASK_ID}]}.bam | \
cut -f 1 | \
perlUnique.pl | \
wc -l \
> output/aligned/counts/${fileArray[${SLURM_ARRAY_TASK_ID}]##*/}_counts.txt
sbatch sbatchCmds/countAligned.sh
## Submitted batch job 2434953
aligned_counts <-
read_tsv(list.files(path = "output/aligned/counts",
pattern = "_counts.txt",
full.names = TRUE),
id = "Sample",
col_names = "aligned_reads",
show_col_types = FALSE) %>%
mutate(Sample = str_remove(Sample, "output/aligned/counts/") %>%
str_remove("_counts.txt"))
ggplot(aligned_counts, aes(x = Sample, y = aligned_reads / 1000000)) +
geom_col() +
labs(x = "",
y = "Reads Aligned (in millions)",
title = "Number of Reads Aligned to the Genome") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggsave("output/figures/CountAligned.png",
height = 3000,
width = 4000,
units = "px")
#!/bin/sh
#SBATCH --account=gdkendalllab
#SBATCH --error=slurmOut/count-%j.txt
#SBATCH --output=slurmOut/count-%j.txt
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=10
#SBATCH --job-name count
#SBATCH --wait
#SBATCH --mail-user=matthew.cannon@nationwidechildrens.org
#SBATCH --mail-type=FAIL,REQUEUE,TIME_LIMIT_80
set -e ### stops bash script if line ends with error
echo ${HOSTNAME} ${SLURM_ARRAY_TASK_ID}
module purge
module load GCC/9.3.0 \
SAMtools/1.10
featureCounts \
-T 10 \
-a ~/analyses/kendall/refs/annotations/danRer11.ensGene_WithERCC.gtf.gz \
-o output/geneCounts/combined.txt \
-p \
--countReadPairs \
-s 0 \
output/aligned/*.bam
sbatch sbatchCmds/countGenes.sh
genic_reads <- read_tsv("output/geneCounts/combined.txt.summary",
comment = "#",
show_col_types = FALSE) %>%
rename_with(~ str_remove(.x, "output/aligned/")) %>%
rename_with(~ str_remove(.x, ".bam")) %>%
pivot_longer(-Status,
names_to = "Sample",
values_to = "Reads") %>%
filter(Status == "Assigned" |
Status == "Unassigned_NoFeatures" |
Status == "Unassigned_Ambiguity") %>%
mutate(Status = str_replace(
Status,
"Unassigned_NoFeatures",
"Non-genic") %>%
str_replace("Unassigned_Ambiguity", "Multiple Features")) %>%
group_by(Sample) %>%
mutate(Percent_Reads = Reads / sum(Reads) * 100)
ggplot(genic_reads, aes(x = Sample,
y = Percent_Reads,
fill = Status)) +
geom_col() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_fill_discrete(name = "Read Context") +
xlab("") +
ylab("Percent of reads for each sample") +
ggtitle("Assignment of Sequencing Reads to Genes") +
scale_fill_brewer(type = "qual", palette = "Set2")
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
ggsave("output/figures/geneAssignmentPlot.png",
width = 3000,
height = 2000,
units = "px",
device = "png")
reads_latw <- input_reads %>%
filter(Read == "R1") %>%
group_by(Sample) %>%
summarize(Sequenced_Reads = sum(Reads)) %>%
full_join(aligned_counts, by = "Sample") %>%
rename(Uniquely_Aligned_Reads = aligned_reads) %>%
full_join(genic_reads %>%
filter(Status == "Assigned") %>%
rename(Genic_Reads = Reads) %>%
select(Sample, Genic_Reads),
by = "Sample") %>%
pivot_longer(-Sample,
names_to = "Stage",
values_to = "Reads") %>%
mutate(Stage = str_replace_all(Stage, "_", " ") %>%
fct_relevel("Sequenced Reads",
"Uniquely Aligned Reads",
"Genic Reads"))
reads_latw %>%
pivot_wider(names_from = Stage, values_from = Reads) %>%
mutate(Condition = Sample %>%
str_replace("her3_38aa.+", "Her3 38aa knockout") %>%
str_replace("her3-MO.+", "Her3 morpholino") %>%
str_replace("mm-MO.+", "Mismatched morpholino control") %>%
str_replace("WIK.+", "WIK control"),
Collection_Time = str_remove(Sample, ".+_")) %>%
arrange(Condition, Collection_Time) %>%
select(Sample,
Condition,
Collection_Time,
`Sequenced Reads`,
`Uniquely Aligned Reads`,
`Genic Reads`) %>%
rename(Sample_Name = Sample) %>%
rename_with(~ str_replace(.x, "_", " ")) %>%
gt() %>%
fmt_number(columns = c("Sequenced Reads",
"Uniquely Aligned Reads",
"Genic Reads"),
decimals = 0,
sep_mark = ",") %>%
gtsave("output/figures/sampleReadCounts.png")
ggplot(reads_latw, aes(x = Sample, y = Reads / 1000000, fill = Stage)) +
geom_col(position = "dodge") +
ylab("Reads (in millions)") +
xlab("") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
ggtitle("Number of Reads Per Sample at Each Stage") +
scale_fill_brewer(type = "qual", palette = "Set2")
ggsave("output/figures/ReadsLostAlongTheWay.png",
width = 3000,
height = 2000,
units = "px",
device = "png")
read_tsv("output/geneCounts/combined.txt",
show_col_types = FALSE,
comment = "#") %>%
filter(Geneid == "ENSDARG00000076857.4") %>%
select(starts_with("output")) %>%
rename_with(~ str_remove(.x, "output/aligned/")) %>%
rename_with(~ str_remove(.x, ".bam")) %>%
pivot_longer(cols = everything(),
names_to = "Sample",
values_to = "Her3") %>%
mutate(Group = str_replace(Sample, "_[123]_", "_")) %>%
ggplot(aes(x = Group,
y = Her3,
fill = Group)) +
geom_boxplot() +
geom_point(color = "black") +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none")
ggsave("output/figures/unNormHer3.png",
width = 3000,
height = 2000,
units = "px",
device = "png")
read_tsv("output/geneCounts/combined.txt",
show_col_types = FALSE,
comment = "#") %>%
select(starts_with("output")) %>%
rename_with(~ str_remove(.x, "output/aligned/")) %>%
rename_with(~ str_remove(.x, ".bam")) %>%
mutate(gene_sum = rowSums(across())) %>%
filter(gene_sum > 10) %>%
select(-gene_sum) %>%
GGally::ggpairs(progress = FALSE)
ggsave("output/figures/unNormPairs.png",
width = 8000,
height = 8000,
units = "px",
device = "png")
group_table <- tibble(samples = list.files(path = "output/aligned",
pattern = ".bam$")) %>%
mutate(samples = str_remove(samples, "output/aligned/") %>%
str_remove(".bam")) %>%
mutate(group = str_replace(samples, "_[123]_", "_"))
group_list <- group_table %>%
select(group) %>%
distinct() %>%
pull(group)
gene_names <- read_tsv("/gpfs0/home1/gdlessnicklab/mvc002/analyses/kendall/refs/zebrafish_gene_info.txt.gz",
show_col_types = FALSE) %>%
rename(Gene = `Gene stable ID version`) %>%
select(-`Transcript stable ID version`,
-`Transcript stable ID`,
-`Transcript type`,
-`Gene type`,
-`Gene stable ID`) %>%
mutate(Gene = str_remove(Gene, "\\.[0-9]+")) %>%
unique() %>%
rename(gene_name = `Gene name`)
# Loop through all combinations of groups
for (i in seq_len(length(group_list) - 1)) {
for (j in (i + 1):length(group_list)) {
group_1 <- group_list[i]
group_2 <- group_list[j]
# Get the samples in each group
samples <- group_table %>%
filter(group == group_1 | group == group_2) %>%
select(samples) %>%
pull(samples)
groups <- group_table %>%
filter(group == group_1 | group == group_2) %>%
select(group) %>%
pull(group)
gene_expr <- read_tsv("output/geneCounts/combined.txt",
col_names = TRUE,
comment = "#",
show_col_types = FALSE) %>%
select(-Chr, -Start, -End, -Strand, -Length) %>%
rename_all(~ str_remove(., "output/aligned/")) %>%
rename_all(~ str_remove(., "_.bam")) %>%
rename_all(~ str_remove(., ".R.bam")) %>%
rename_all(~ str_remove(., ".bam")) %>%
rename_all(~ str_remove(., ".+/")) %>%
select(all_of(samples), Geneid) %>%
pivot_longer(cols = !Geneid,
names_to = "Sample",
values_to = "Reads") %>%
mutate(Geneid = str_remove(Geneid, "\\.[0-9]+")) %>%
group_by(Geneid, Sample) %>%
summarize(Reads = sum(Reads)) %>%
ungroup() %>%
pivot_wider(names_from = Sample, values_from = Reads) %>%
column_to_rownames(var = "Geneid")
### Back to our regularly scheduled program
gene_expr <- gene_expr %>%
rownames_to_column("gene") %>%
filter(!grepl("ERCC", gene)) %>%
column_to_rownames(var = "gene")
data_dge <- DGEList(counts = gene_expr, group = groups)
kept_df <- filterByExpr(data_dge, min.count = 0.1)
data_dge <- data_dge[kept_df, keep.lib.sizes = FALSE] %>%
calcNormFactors(.) %>%
estimateDisp(., model.matrix(~ groups))
dim(data_dge$counts)
tested <- exactTest(data_dge)
results <- topTags(tested, n = Inf)
results$table$Gene <- rownames(results$table)
g1 <- paste(group_1, "mean", sep = "_")
g2 <- paste(group_2, "mean", sep = "_")
summarized_data <- cpm(data_dge) %>%
as.data.frame() %>%
rownames_to_column(var = "Gene") %>%
as_tibble() %>%
full_join(cpmByGroup(data_dge) %>%
as.data.frame() %>%
rename_all(~ str_c(.x, "_mean")) %>%
rownames_to_column("Gene") %>%
as_tibble(),
by = "Gene") %>%
full_join(results$table, by = "Gene") %>%
left_join(gene_names, by = "Gene") %>%
rowwise() %>%
mutate(signif = FDR <= 0.1 &
abs(logFC) >= 1.5)
write.table(summarized_data,
file = paste("output/edgeROut/edgeR_",
group_1,
"_",
group_2,
".txt",
sep = ""),
quote = FALSE,
sep = "\t",
row.names = FALSE)
# Plotting ########################
paste_end <- function(x) {
paste(x, "_", group_1, "_", group_2, ".png", sep = "")
}
### logFC histogram
hist_plot_fc <- ggplot(summarized_data, aes(x = logFC)) +
geom_vline(xintercept = 0, color = "red") +
geom_histogram(bins = 100) +
ggtitle(paste("Histogram of logFC\n",
group_1,
"\n",
group_2,
sep = ""))
ggsave(filename = paste_end("output/figures/edgeR/logFCHistogram"),
plot = hist_plot_fc,
width = 2000,
height = 2000,
units = "px",
device = "png")
hist_plot_pval <- ggplot(summarized_data, aes(x = PValue)) +
geom_histogram(bins = 100) +
ggtitle(paste("Histogram of p-values\n",
group_1,
"\n",
group_2,
sep = ""))
ggsave(paste_end("output/figures/edgeR/pvalueHistogram"),
plot = hist_plot_pval,
width = 2000,
height = 2000,
units = "px",
device = "png")
hist_plot_fdr <- ggplot(summarized_data, aes(x = FDR)) +
geom_histogram(bins = 100) +
ggtitle(paste("Histogram of FDR values\n",
group_1,
"\n",
group_2,
sep = ""))
ggsave(paste_end("output/figures/edgeR/fdrHistogram"),
plot = hist_plot_fdr,
width = 2000,
height = 2000,
units = "px",
device = "png")
picked_genes <- summarized_data %>%
ungroup() %>%
arrange(abs(logFC) * -1) %>%
filter(grepl("^si:|^zgc:|^CABZ0|^C[uU][0-9]|^C[rR][0-9]|^Zgc:|^F[pP][0-9]|^BX[0-9]|^CT[0-9]|^FO[0-9]|^L0[0-9]",
gene_name) == FALSE) %>%
slice_head(n = 40) %>%
mutate(gene_name = str_to_title(gene_name)) %>%
select(logFC, PValue, gene_name) %>%
distinct(gene_name, .keep_all = TRUE)
volcano_plot <- ggplot(summarized_data,
aes(x = logFC * -1, # reverse x axis
y = -log10(PValue),
color = signif)) +
geom_point(alpha = 0.5) +
ggtitle(paste("Volcano plot\n",
"FDR <= 0.1 & abs(logFC) >= 1.5 denoted in orange\n",
group_1,
"\nvs\n",
group_2,
sep = "")) +
ggrepel::geom_text_repel(data = picked_genes,
aes(x = logFC * -1, # reverse x axis
y = -log10(PValue),
label = gene_name),
color = "black",
min.segment.length = 0,
arrow = arrow(length = unit(0.01, "npc")),
size = 2.8,
max.overlaps = 20,
force_pull = 0.01) +
scale_color_brewer(type = "qual",
name = "Significant",
palette = "Set2") +
theme(legend.position = "none")
assign(paste("volcano_plot_",
group_1,
"_",
group_2,
sep = ""),
volcano_plot)
ggsave(paste_end("output/figures/edgeR/volcanoPlot"),
plot = volcano_plot,
width = 2000,
height = 2000,
units = "px",
device = "png")
scatter_plot_log <- ggplot(summarized_data,
aes(x = !!sym(paste(group_1, "_mean", sep = "")),
y = !!sym(paste(group_2, "_mean", sep = "")),
color = signif)) +
geom_point(alpha = 0.2) +
geom_abline(slope = 1, intercept = 0) +
ggtitle(paste("Scatterplot of mean CPM values\n",
group_1,
"\n",
group_2,
sep = "")) +
scale_y_log10() +
scale_x_log10() +
scale_color_brewer(type = "qual",
name = "Significant",
palette = "Set2")
ggsave(paste_end("output/figures/edgeR/logGroupScatter"),
plot = scatter_plot_log,
width = 2000,
height = 2000,
units = "px",
device = "png")
scatter_plot <- ggplot(summarized_data,
aes(x = !!sym(paste(group_1, "_mean", sep = "")),
y = !!sym(paste(group_2, "_mean", sep = "")),
color = signif)) +
geom_point(alpha = 0.2) +
geom_abline(slope = 1, intercept = 0) +
ggtitle(paste("Scatterplot of mean CPM values\n",
group_1,
" ",
group_2,
sep = "")) +
scale_color_brewer(type = "qual",
name = "Significant",
palette = "Set2")
ggsave(paste_end("output/figures/edgeR/groupScatter"),
plot = scatter_plot,
width = 2000,
height = 2000,
units = "px",
device = "png")
}
}
## `summarise()` has grouped output by 'Geneid'. You can override using the `.groups` argument.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## `summarise()` has grouped output by 'Geneid'. You can override using the `.groups` argument.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## `summarise()` has grouped output by 'Geneid'. You can override using the `.groups` argument.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## `summarise()` has grouped output by 'Geneid'. You can override using the `.groups` argument.
## Warning: ggrepel: 6 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## `summarise()` has grouped output by 'Geneid'. You can override using the `.groups` argument.
## Warning: ggrepel: 7 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## `summarise()` has grouped output by 'Geneid'. You can override using the `.groups` argument.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## `summarise()` has grouped output by 'Geneid'. You can override using the `.groups` argument.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## `summarise()` has grouped output by 'Geneid'. You can override using the `.groups` argument.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## `summarise()` has grouped output by 'Geneid'. You can override using the `.groups` argument.
## Warning: Removed 1 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## `summarise()` has grouped output by 'Geneid'. You can override using the `.groups` argument.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## `summarise()` has grouped output by 'Geneid'. You can override using the `.groups` argument.
## Warning: Removed 1 rows containing missing values (geom_text_repel).
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## `summarise()` has grouped output by 'Geneid'. You can override using the `.groups` argument.
## Warning: ggrepel: 1 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## `summarise()` has grouped output by 'Geneid'. You can override using the `.groups` argument.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## `summarise()` has grouped output by 'Geneid'. You can override using the `.groups` argument.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## `summarise()` has grouped output by 'Geneid'. You can override using the `.groups` argument.
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
combined_data <- NULL
for (in_file in list.files(path = "output/edgeROut",
pattern = "^edgeR_.+",
full.names = TRUE)) {
combined_data <- read_tsv(in_file,
col_select = c("signif", "logFC"),
show_col_types = FALSE) %>%
mutate(group_1 = str_remove(in_file, ".+edgeR_") %>%
str_remove(".txt") %>%
str_replace("hpf.+", "hpf"),
group_2 = str_remove(in_file, ".+edgeR_") %>%
str_remove(".txt") %>%
str_remove(".+hpf_")) %>%
group_by(group_1, group_2) %>%
summarize(n = n(),
n_de = sum(signif == "TRUE"),
de_label = paste(sum(signif == TRUE & logFC > 0),
" / ",
sum(signif == TRUE & logFC < 0),
sep = ""),
.groups = "drop") %>%
bind_rows(combined_data)
}
ggplot(combined_data,
aes(x = group_1,
y = group_2,
label = de_label,
fill = n_de)) +
geom_tile() +
geom_label(fill = "white", label.size = NA) +
labs(x = "",
y = "",
title = "Number of differentially expressed genes\n(up-regulated/down-regulated)",
fill = "DE genes") +
scale_fill_gradient(low = "#fcfbfd", high = "#4a1486") +
theme(text = element_text(size = 20))
ggsave("output/figures/deTilePlot.png",
width = 2800,
height = 2400,
units = "px",
device = "png")
tile_plot_nomo <- combined_data %>%
mutate(group_1 = str_replace_all(group_1, "_", " ") %>%
str_replace("her3", "Her3"),
group_2 = str_replace_all(group_2, "_", " ") %>%
str_replace("her3", "Her3")) %>%
filter(!grepl("-MO", group_1) & !grepl("-MO", group_2)) %>%
ggplot(aes(x = group_1,
y = group_2,
label = de_label,
fill = n_de)) +
geom_tile() +
geom_label(fill = "white",
label.size = NA,
size = 2) +
labs(x = "",
y = "",
title = "Number of differentially expressed genes\n(up-regulated/down-regulated)",
fill = "DE genes") +
scale_fill_gradient(low = "#fcfbfd", high = "#4a1486") +
theme(text = element_text(size = 10))
ggsave("output/figures/deTilePlot_noMo.png",
plot = tile_plot_nomo,
width = 2200,
height = 1800,
units = "px",
device = "png")
gene_data <- read_tsv("output/geneCounts/combined.txt",
col_names = TRUE,
comment = "#",
show_col_types = FALSE) %>%
select(-Chr,
-Start,
-End,
-Strand,
-Length) %>%
rename_all(~ str_remove(., "output/aligned/")) %>%
rename_all(~ str_remove(., "_.bam")) %>%
rename_all(~ str_remove(., ".R.bam")) %>%
rename_all(~ str_remove(., ".bam")) %>%
rename_all(~ str_remove(., ".+/")) %>%
select(matches("WIK_[123]_24hpf"),
matches("her3_38aa_[123]_24hpf"),
Geneid) %>%
pivot_longer(cols = !Geneid,
names_to = "Sample",
values_to = "Reads") %>%
mutate(Geneid = str_remove(Geneid, "\\.[0-9]+")) %>%
group_by(Geneid, Sample) %>%
summarize(Reads = sum(Reads)) %>%
ungroup() %>%
pivot_wider(names_from = Sample, values_from = Reads) %>%
as_tibble() %>%
column_to_rownames(var = "Geneid") %>%
DGEList(counts = ., group = c(1, 1, 1, 2, 2, 2)) %>%
calcNormFactors(.) %>%
estimateDisp(., model.matrix(~ c(1, 1, 1, 2, 2, 2)))
## `summarise()` has grouped output by 'Geneid'. You can override using the `.groups` argument.
gene_expr <- exactTest(gene_data) %>%
topTags(., n = Inf) %>%
as.data.frame() %>%
rownames_to_column("gene") %>%
full_join(read_tsv("misc/ERCC_expected_output.txt",
show_col_types = FALSE) %>%
rename(gene = `ERCC ID`,
exp_log2 = `log2(Mix 1/Mix 2)`,
mix_1 = `concentration in Mix 1 (attomoles/ul)`,
mix_2 = `concentration in Mix 2 (attomoles/ul)`)) %>%
full_join(cpmByGroup(gene_data) %>%
as.data.frame() %>%
rownames_to_column("gene") %>%
rename(her3_mean = `1`,
wik_mean = `2`)) %>%
mutate(de_exp = exp_log2 != 0,
de_meas = FDR <= 0.1)
## Joining, by = "gene"
## Joining, by = "gene"
cor.test(gene_expr$mix_1, gene_expr$wik_mean)
##
## Pearson's product-moment correlation
##
## data: gene_expr$mix_1 and gene_expr$wik_mean
## t = 57.256, df = 90, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9796910 0.9911023
## sample estimates:
## cor
## 0.9865496
gene_expr %>%
filter(grepl("ERCC", gene)) %>%
ggplot(aes(x = mix_1, y = wik_mean)) +
geom_point() +
scale_x_log10() +
scale_y_log10()
## Warning: Transformation introduced infinite values in continuous y-axis
gene_expr %>%
filter(grepl("ERCC", gene)) %>%
ggplot(aes(x = mix_2, y = her3_mean)) +
geom_point() +
scale_x_log10() +
scale_y_log10()
## Warning: Transformation introduced infinite values in continuous y-axis
ggplot(gene_expr, aes(x = logFC,
y = -log10(PValue),
color = de_exp)) +
geom_point(alpha = 0.5)
ggplot(gene_expr, aes(x = logFC)) +
geom_histogram(bins = 500)
gene_expr %>%
filter(grepl("ERCC", gene)) %>%
ggplot(aes(x = exp_log2,
y = logFC,
color = FDR <= 0.1)) +
geom_point(alpha = 0.5)
gene_expr %>%
filter(grepl("ERCC", gene)) %>%
ggplot(aes(x = mix_1 / mix_2,
y = wik_mean / her3_mean,
color = FDR <= 0.1)) +
geom_point(alpha = 0.5)
## Warning: Removed 3 rows containing missing values (geom_point).
gene_expr %>%
filter(grepl("ERCC", gene)) %>%
rowwise() %>%
mutate(min_mix = max(her3_mean, wik_mean)) %>%
ggplot(aes(x = min_mix,
y = logFC,
color = de_meas)) +
geom_rect(aes(xmin = 0.1,
xmax = 5000,
ymin = 1.5,
ymax = 4),
fill = "#78fd7e",
color = NA,
alpha = 0.01) +
geom_rect(aes(xmin = 0.1,
xmax = 5000,
ymin = -1.5,
ymax = -4),
fill = "#78fd7e",
color = NA,
alpha = 0.01) +
geom_jitter() +
scale_x_log10() +
geom_vline(xintercept = 0.1) +
geom_hline(yintercept = c(-1.5, 1.5)) +
ggtitle("Potential FC (1.5?) and max CPM (0.1?) for DE\nFaceted by expected log FC") +
facet_wrap(~ exp_log2)
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 3 rows containing missing values (geom_point).
group_table <- tibble(samples = list.files(path = "output/aligned",
pattern = ".bam$")) %>%
mutate(samples = str_remove(samples, "output/aligned/") %>%
str_remove(".bam")) %>%
mutate(group = str_replace(samples, "_[123]_", "_") %>%
str_replace_all("_", " ") %>%
str_replace("her3", "Her3") %>%
fct_relevel("Her3 38aa 24hpf",
"Her3 38aa 72hpf",
"WIK 24hpf",
"WIK 72hpf",
"Her3-MO 24hpf",
"mm-MO 24hpf"))
group_list <- group_table %>%
select(group) %>%
distinct() %>%
pull(group)
gene_expr <- read_tsv("output/geneCounts/combined.txt",
col_names = TRUE,
comment = "#",
show_col_types = FALSE) %>%
select(-Chr,
-Start,
-End,
-Strand,
-Length) %>%
rename_all(~ str_remove(., "output/aligned/")) %>%
rename_all(~ str_remove(., "_.bam")) %>%
rename_all(~ str_remove(., ".R.bam")) %>%
rename_all(~ str_remove(., ".bam")) %>%
rename_all(~ str_remove(., ".+/")) %>%
pivot_longer(cols = !Geneid,
names_to = "Sample",
values_to = "Reads") %>%
mutate(Geneid = str_remove(Geneid, "\\.[0-9]+")) %>%
group_by(Geneid, Sample) %>%
summarize(Reads = sum(Reads), .groups = "drop") %>%
ungroup() %>%
pivot_wider(names_from = Sample, values_from = Reads) %>%
column_to_rownames(var = "Geneid")
data_dge <- DGEList(counts = gene_expr, group = group_table$group) %>%
calcNormFactors(.) %>%
estimateDisp(., model.matrix(~ group_table$group))
kept_df <- filterByExpr(data_dge, min.count = 0.1)
data_dge <- data_dge[kept_df, keep.lib.sizes = FALSE] %>%
calcNormFactors(.) %>%
estimateDisp(., model.matrix(~ group_table$group))
dim(data_dge$counts)
## [1] 25610 18
gene_counts <- cpm(data_dge) %>%
as.data.frame() %>%
t() %>%
as.data.frame() %>%
select(!starts_with("ERCC"))
pca <- prcomp(gene_counts)
write.table(gene_counts,
"output/geneCounts/normGeneCounts.txt",
row.names = TRUE,
quote = FALSE,
sep = "\t")
variance <- (pca$sdev)^2 / sum(pca$sdev^2) * 100
names(variance) <- colnames(pca$x)
plot(variance)
prin_comps <- pca$x %>%
as.data.frame() %>%
rownames_to_column(var = "samples") %>%
as_tibble() %>%
full_join(group_table)
## Joining, by = "samples"
plot_colors <- scales::brewer_pal(type = "qual", palette = "Set2")(6)
names(plot_colors) <- levels(prin_comps$group)
pca_all <- ggplot(prin_comps, aes(x = PC1,
y = PC2,
color = group)) +
geom_point(size = 3) +
ggtitle("PCA Plot for all Samples") +
ggrepel::geom_text_repel(aes(label = samples %>%
str_replace_all("_", " ") %>%
str_replace("her3", "Her3")),
size = 3) +
xlab(paste("PC1 (",
round(variance[1], 1),
"%)",
sep = "")) +
ylab(paste("PC2 (",
round(variance[2], 1),
"%)",
sep = "")) +
scale_color_manual(values = plot_colors,
name = "") +
theme(legend.position = c(0.9, 0.2),
legend.text = element_text(size = 6))
ggsave("output/figures/allSamplePCA.png",
plot = pca_all,
width = 2500,
height = 2000,
units = "px",
device = "png")
for (timepoint in c("24hpf", "72hpf")) {
sub_pca <- gene_counts %>%
as.data.frame() %>%
rownames_to_column(var = "samples") %>%
filter(grepl(timepoint, samples),
grepl("-MO", samples) == FALSE) %>%
column_to_rownames(var = "samples") %>%
prcomp()
sub_prin_comps <- sub_pca$x %>%
as.data.frame() %>%
rownames_to_column(var = "samples") %>%
as_tibble() %>%
left_join(group_table)
variance <- (sub_pca$sdev)^2 / sum(sub_pca$sdev^2) * 100
names(variance) <- colnames(sub_pca$x)
plot(variance)
sub_plot_colors <- plot_colors[sub_prin_comps$group %>%
droplevels() %>%
levels()]
sub_plot <- ggplot(sub_prin_comps, aes(x = PC1,
y = PC2,
color = group)) +
geom_point(size = 3) +
ggtitle(timepoint) +
ggrepel::geom_text_repel(aes(label = samples %>%
str_replace_all("_", " ") %>%
str_replace("her3", "Her3")),
size = 3) +
xlab(paste("PC1 (",
round(variance[1], 1),
"%)",
sep = ""
)) +
ylab(paste("PC2 (",
round(variance[2], 1),
"%)",
sep = ""
)) +
theme(legend.position = "none") +
scale_color_manual(values = sub_plot_colors)
ggsave(paste("output/figures/",
timepoint,
"PCA.png",
sep = ""),
sub_plot,
width = 2500,
height = 2000,
units = "px",
device = "png"
)
assign(paste("pca_", timepoint, sep = ""), sub_plot)
}
## Joining, by = "samples"
## Joining, by = "samples"
layout_matrix <- matrix(c(1,1,1,1,1,1,
1,1,1,1,1,1,
1,1,1,1,1,1,
1,1,1,1,1,1,
1,1,1,1,1,1,
2,2,2,3,3,3,
2,2,2,3,3,3,
2,2,2,3,3,3),
ncol = 6,
byrow = TRUE)
png("output/figures/PCA_grid.png",
width = 2500,
height = 2500,
res = 300)
gridExtra::grid.arrange(pca_all,
pca_24hpf,
pca_72hpf,
layout_matrix = layout_matrix)
dev.off()
## png
## 2
#### Heatmap of all samples except Morpholino
group_df <- data.frame(Group = rownames(gene_counts) %>%
str_replace("_[123]_",
"_") %>%
str_replace_all("_", " ") %>%
str_replace("her3", "Her3"),
row.names = rownames(gene_counts) %>%
str_replace_all("_", " ") %>%
str_replace("her3", "Her3"))
heatmap_noMo <-
gene_counts %>%
rownames_to_column("group") %>%
mutate(group = str_replace(group, "her3", "Her3") %>%
str_replace_all("_", " ")) %>%
column_to_rownames(var = "group") %>%
t() %>%
as.data.frame() %>%
select(., !contains("-MO")) %>%
mutate(rowsums = rowSums(.)) %>%
filter(rowsums > 0) %>%
select(-rowsums) %>%
as.matrix() %>%
pheatmap::pheatmap(.,
color = colorRampPalette(rev(RColorBrewer::brewer.pal(n = 7,
name = "PuOr")))(100),
scale = "row",
show_rownames = FALSE,
annotation_col = group_df,
annotation_colors = list(Group = plot_colors[grep("-MO",
names(plot_colors),
invert = TRUE)
]),
annotation_names_col = FALSE,
fontsize = 8,
fontsize_number = 5,
annotation_legend = FALSE)
png("output/figures/heatmap_noMO.png",
width = 2000,
height = 3000,
res = 300)
print(heatmap_noMo)
dev.off()
## png
## 2
only_24h <- gene_counts[grepl("24hpf", rownames(gene_counts)), ] %>%
scale() %>%
t() %>%
na.omit()
#### PCAs, plus heatmap of all samples except Morpholino
### Also including volcano plots from above as well as the DE tile plot
layout_matrix_2 = matrix(c(1,1,1,1,1,1,4,4,4,4,
1,1,1,1,1,1,4,4,4,4,
1,1,1,1,1,1,4,4,4,4,
1,1,1,1,1,1,4,4,4,4,
1,1,1,1,1,1,4,4,4,4,
2,2,2,3,3,3,4,4,4,4,
2,2,2,3,3,3,4,4,4,4,
2,2,2,3,3,3,4,4,4,4,
5,5,5,6,6,6,7,7,7,7,
5,5,5,6,6,6,7,7,7,7,
5,5,5,6,6,6,7,7,7,7),
ncol = 10,
byrow = TRUE)
png("output/figures/PCA_grid_plus_heatmap.png",
width = 3500,
height = 3500,
res = 300)
gridExtra::grid.arrange(grobs = list(pca_all +
ggtitle(""),
pca_24hpf,
pca_72hpf,
heatmap_noMo[[4]],
volcano_plot_her3_38aa_24hpf_WIK_24hpf +
ggtitle("24hpf"),
volcano_plot_her3_38aa_72hpf_WIK_72hpf +
ggtitle("72hpf"),
tile_plot_nomo + ggtitle("")),
layout_matrix = layout_matrix_2)
## Warning: Removed 1 rows containing missing values (geom_text_repel).
## Warning: ggrepel: 21 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
## Warning: ggrepel: 16 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
dev.off()
## png
## 2
png("output/figures/heatmap_24hpf.png",
width = 1500,
height = 2500)
pheatmap::pheatmap(only_24h,
color = colorRampPalette(rev(RColorBrewer::brewer.pal(n = 7,
name = "PuOr")))(100),
show_rownames = FALSE,
scale = "none",
annotation_col =
data.frame(Group = rownames(gene_counts),
row.names = rownames(gene_counts)) %>%
mutate(Group = str_replace(Group,
"_[123]_",
"_") %>%
str_replace_all("_", " ")) %>%
filter(grepl("24hpf", Group)),
annotation_names_col = FALSE,
fontsize = 30)
dev.off()
## png
## 3
read.delim("output/geneCounts/normGeneCounts.txt") %>%
rownames_to_column("samples") %>%
as_tibble() %>%
select(samples, ENSDARG00000076857) %>%
mutate(group = str_replace(samples, "_[123]_", "_") %>%
str_replace_all("_", " ")) %>%
ggplot(aes(x = group,
y = ENSDARG00000076857)) +
geom_boxplot() +
geom_jitter() +
ggtitle("Her3 Normalized Counts Per Group") +
labs(x = "", y = "Her3 Normalized Expression") +
theme(legend.position = "none")
ggsave("output/figures/her3norm.png",
width = 2500,
height = 2000,
units = "px",
device = "png")
norm_data <- read.delim("output/geneCounts/normGeneCounts.txt") %>%
rownames_to_column("samples") %>%
as_tibble() %>%
mutate(samples = str_replace_all(samples, "_", " ")) %>%
column_to_rownames("samples") %>%
t() %>%
as.data.frame()
scatter_plot <- GGally::ggpairs(norm_data, progress = FALSE)
ggsave("output/figures/normPairs.png",
plot = scatter_plot,
width = 8000,
height = 8000,
units = "px",
device = "png")
scatter_plot2 <- norm_data %>%
select(!(contains("-MO"))) %>%
GGally::ggpairs(progress = FALSE)
ggsave("output/figures/normPairs_noMO.png",
plot = scatter_plot2,
width = 8000,
height = 8000,
units = "px",
device = "png")
in_files <- c("output/edgeROut/edgeR_her3_38aa_72hpf_WIK_72hpf.txt",
"output/edgeROut/edgeR_her3_38aa_24hpf_WIK_24hpf.txt",
"output/edgeROut/edgeR_her3-MO_24hpf_WIK_24hpf.txt",
"output/edgeROut/edgeR_her3-MO_24hpf_mm-MO_24hpf.txt")
signif_list <- list()
for (in_file_name in in_files) {
stub_name <- gsub("output/edgeROut/edgeR_", "", in_file_name) %>%
str_replace(".txt", "") %>%
str_replace_all("_", " ") %>%
str_remove(" 24hpf") %>%
str_remove(" 72hpf") %>%
str_replace(" 24", "\n24") %>%
str_replace(" 72", "\n72") %>%
str_replace("WIK", "\nWIK") %>%
str_replace("mm-MO", "\nmm-MO")
signif_list[[stub_name]] <- read_tsv(in_file_name,
col_select = c("Gene",
"gene_name",
"signif"),
show_col_types = FALSE) %>%
filter(signif == "TRUE") %>%
pull(gene_name)
}
png("output/figures/signifGeneVennDiagrams.png",
width = 3500,
height = 2500,
res = 300)
ggvenn::ggvenn(signif_list,
show_percentage = FALSE,
set_name_size = 4) +
scale_fill_brewer(type = "qual", palette = "Paired")
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.
dev.off()
## png
## 2
comparisons <- list(mutant_24h_vs_72h = c("her3_38aa_24hpf_WIK_24hpf",
"her3_38aa_72hpf_WIK_72hpf",
"Mutant 24h",
"Mutant 72h"),
mutant_24h_vs_morph_24h = c("her3_38aa_24hpf_WIK_24hpf",
"her3-MO_24hpf_mm-MO_24hpf",
"Mutant 24h",
"Morpholino 24h"))
for (comparison in comparisons) {
summary_df <- read_tsv(paste("output/edgeROut/edgeR_",
comparison[[1]][1],
".txt",
sep = ""),
col_select = c(contains("_mean"),
"Gene",
"signif",
"logFC"),
show_col_types = FALSE) %>%
rename(signif_72h = signif,
logFC_24h = logFC) %>%
inner_join(read_tsv(paste("output/edgeROut/edgeR_",
comparison[[1]][2],
".txt",
sep = ""),
col_select = c(contains("_mean"),
"Gene",
"signif",
"logFC"),
show_col_types = FALSE) %>%
rename(signif_24h = signif,
logFC_72h = logFC)) %>%
select(!starts_with("WIK")) %>%
pivot_longer(cols = c(-Gene,
-signif_24h,
-signif_72h,
-logFC_24h,
-logFC_72h),
names_to = "sample",
values_to = "expression") %>%
filter(signif_24h == "TRUE" | signif_72h == "TRUE") %>%
mutate(logFC_24h = if_else(logFC_24h < 0,
"up",
"down"),
logFC_72h = if_else(logFC_72h < 0,
"up",
"down"),
direction = str_c(logFC_24h, " ", logFC_72h),
both_sig = signif_24h == TRUE & signif_72h == TRUE)
summary_df %>%
select(-expression, -sample) %>%
distinct() %>%
group_by(signif_24h, signif_72h) %>%
summarize(n = n()) %>%
as.data.frame() %>%
rename(`Significant at 24h` = signif_24h,
`Significant at 72h` = signif_72h) %>%
gt() %>%
tab_header(title = "Differentially Expressed Genes") %>%
tab_options(table_body.hlines.width = 0) %>%
gtsave(paste("output/figures/",
comparison[[1]][3],
"_vs_",
comparison[[1]][4],
"_DE_genes.png",
sep = ""))
summary_df %>%
filter(both_sig == TRUE) %>%
select(-expression, -sample) %>%
distinct() %>%
group_by(direction) %>%
summarize(n = n()) %>%
ungroup() %>%
mutate(direction = str_replace_all(direction,
"up",
"Up-regulated") %>%
str_replace_all("down", "Down-regulated")) %>%
separate(direction,
into = c(comparison[[1]][3],
comparison[[1]][4]),
sep = " ") %>%
pivot_wider(names_from = comparison[[1]][4], values_from = n) %>%
gt() %>%
tab_header(title = "Direction of Differentially Expressed Genes") %>%
tab_spanner(label = comparison[[1]][4],
columns = c(`Down-regulated`, `Up-regulated`)) %>%
tab_options(table_body.hlines.width = 0) %>%
gtsave(paste("output/figures/",
comparison[[1]][3],
"_vs_",
comparison[[1]][4],
"_DE_gene_direction.png",
sep = ""))
ggplot() +
geom_jitter(data = summary_df %>%
filter(signif_24h == TRUE &
sample == "her3_38aa_24hpf_mean"),
aes(x = sample,
y = expression),
color = "red",
width = 0.1) +
geom_jitter(data = summary_df %>%
filter(signif_72h == TRUE &
sample == "her3_38aa_72hpf_mean"),
aes(x = sample,
y = expression),
color = "red",
width = 0.1) +
geom_line(data = summary_df %>%
filter(both_sig == TRUE),
aes(x = sample,
y = expression,
group = Gene,
color = both_sig),
size = 0.5) +
scale_y_log10() +
facet_wrap(~ direction) +
ggtitle(paste("Differentially Expressed Genes in ",
comparison[[1]][3],
" vs ",
comparison[[1]][4],
sep = ""))
ggsave(paste("output/figures/linePlotsByGroup_",
comparison[[1]][3],
"_",
comparison[[1]][4],
".png",
sep = ""),
width = 15,
height = 25)
}
for (in_file in list.files(path = "output/edgeROut/", pattern = "edgeR.+txt")) {
read_tsv(paste("output/edgeROut/",
in_file,
sep = ""),
show_col_types = FALSE) %>%
select(Gene, signif) %>%
filter(signif == "TRUE") %>%
select(Gene) %>%
write_tsv(paste("output/edgeROut/",
str_replace(in_file, "edgeR", "listDeGenes"),
sep = ""))
}
#!/usr/bin/perl
use warnings;
use strict;
use Getopt::Long;
use Pod::Usage;
##############################
# By Matt Cannon
# Date: 6-22-21
# Last modified: 6-22-21
# Title: splitFastqcData.pl
# Purpose: Split fastqc_data.txt into sub-files based on modules
##############################
##############################
# Options
##############################
my $verbose;
my $help;
my $input;
my $outputDir = "";
# i = integer, s = string
GetOptions ("verbose" => \$verbose,
"help" => \$help,
"input=s" => \$input,
"outputDir=s" => \$outputDir
) or pod2usage(0) && exit;
pod2usage(1) && exit if ($help);
##############################
# Global variables
##############################
my $outputFH;
my $baseColumn;
##############################
# Code
##############################
if ($outputDir eq "") {
$input =~ /^(.+\/)/;
$outputDir = $1;
mkdir $outputDir . "/split_data"
}
##############################
### Stuff
### More stuff
open my $inputFH, "$input" or die "Could not open first input\n$!";
while (my $input = <$inputFH>){
chomp $input;
if ($input =~ /^>>END_MODULE/) {
# Close file
close $outputFH;
undef $baseColumn;
} elsif ($input =~ /^>>/) {
# Open new file
$input =~ s/^>>//;
$input =~ s/ /_/g;
$input =~ s/\t.+//;
open $outputFH, ">", $outputDir . "/split_data/" . $input . ".txt";
} elsif ($input !~ /^##/) {
# Deal with data
## Find column that has base numbering
if ($input =~ /Base|Position/ && $input =~ /^#/) {
# Change the word Position to Base to make downstream plotting easier
$input =~ s/Position/Base/;
my @dataArray = split "\t", $input;
for (my $i = 0; $i < scalar(@dataArray); $i++) {
if ($dataArray[$i] =~ /^#*Base$/) {
$baseColumn = $i;
}
}
}
# write to file
if (defined($baseColumn)) {
my @dataArray = split "\t", $input;
if ($dataArray[$baseColumn] =~ /-/) {
my ($lowerNum, $upperNum) = split "-", $dataArray[$baseColumn];
for (my $i = $lowerNum; $i <= $upperNum; $i++) {
my @tempArray = @dataArray;
$tempArray[$baseColumn] = $i;
print $outputFH join("\t", @tempArray), "\n";
}
} else {
print $outputFH $input, "\n";
}
} else {
print $outputFH $input, "\n";
}
}
}
##############################
# POD
##############################
#=pod
=head SYNOPSIS
Summary:
xxxxxx.pl - generates a consensus for a specified gene in a specified taxa
Usage:
perl xxxxxx.pl [options]
=head OPTIONS
Options:
--verbose
--help
=cut
sbatch sbatchCmds/homer.sh
for dir in output/motif/homerOut/*/
do
nameStub=${dir%/}
nameStub=${nameStub##*/}
echo $nameStub
cp $dir/knownResults.html output/motif/homerOut/${nameStub}_knownResults.html
cp $dir/homerResults.html output/motif/homerOut/${nameStub}_homerResults.html
done
## her3_38aa_24hpf_her3_38aa_72hpf
## her3_38aa_24hpf_her3-MO_24hpf
## her3_38aa_24hpf_mm-MO_24hpf
## her3_38aa_24hpf_WIK_24hpf
## her3_38aa_24hpf_WIK_72hpf
## her3_38aa_72hpf_her3-MO_24hpf
## her3_38aa_72hpf_mm-MO_24hpf
## her3_38aa_72hpf_WIK_24hpf
## her3_38aa_72hpf_WIK_72hpf
## her3-MO_24hpf_mm-MO_24hpf
## her3-MO_24hpf_WIK_24hpf
## her3-MO_24hpf_WIK_72hpf
## mm-MO_24hpf_WIK_24hpf
## mm-MO_24hpf_WIK_72hpf
## WIK_24hpf_WIK_72hpf
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: CentOS Linux 8 (Core)
##
## Matrix products: default
## BLAS: /gpfs0/export/apps/opt/R/4.1.0-foss-2020a/lib64/R/lib/libRblas.so
## LAPACK: /gpfs0/export/apps/opt/R/4.1.0-foss-2020a/lib64/R/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] edgeR_3.34.0 limma_3.48.0 gt_0.3.0 forcats_0.5.1
## [5] stringr_1.4.0 dplyr_1.0.7 purrr_0.3.4 readr_2.0.1
## [9] tidyr_1.1.3 tibble_3.1.2 ggplot2_3.3.4 tidyverse_1.3.1
## [13] rmarkdown_2.9
##
## loaded via a namespace (and not attached):
## [1] fs_1.5.0 lubridate_1.7.10 bit64_4.0.5
## [4] webshot_0.5.2 RColorBrewer_1.1-2 httr_1.4.2
## [7] tools_4.1.0 backports_1.2.1 bslib_0.2.5.1
## [10] utf8_1.2.1 R6_2.5.0 DBI_1.1.1
## [13] colorspace_2.0-1 knitrBootstrap_1.0.2 withr_2.4.2
## [16] gridExtra_2.3 tidyselect_1.1.1 GGally_2.1.2
## [19] processx_3.5.2 bit_4.0.4 compiler_4.1.0
## [22] textshaping_0.3.5 cli_2.5.0 rvest_1.0.0
## [25] xml2_1.3.2 labeling_0.4.2 sass_0.4.0
## [28] scales_1.1.1 checkmate_2.0.0 callr_3.7.0
## [31] systemfonts_1.0.2 digest_0.6.27 pkgconfig_2.0.3
## [34] htmltools_0.5.1.1 dbplyr_2.1.1 highr_0.9
## [37] rlang_0.4.11 readxl_1.3.1 rstudioapi_0.13
## [40] jquerylib_0.1.4 farver_2.1.0 generics_0.1.0
## [43] jsonlite_1.7.2 vroom_1.5.4 magrittr_2.0.1
## [46] Rcpp_1.0.7 munsell_0.5.0 fansi_0.5.0
## [49] lifecycle_1.0.0 stringi_1.6.2 yaml_2.2.1
## [52] plyr_1.8.6 grid_4.1.0 parallel_4.1.0
## [55] ggrepel_0.9.1 crayon_1.4.1 lattice_0.20-44
## [58] haven_2.4.1 splines_4.1.0 hms_1.1.0
## [61] locfit_1.5-9.4 knitr_1.33 ps_1.6.0
## [64] pillar_1.6.1 markdown_1.1 codetools_0.2-18
## [67] reprex_2.0.0 glue_1.4.2 ggvenn_0.1.9
## [70] praise_1.0.0 evaluate_0.14 modelr_0.1.8
## [73] vctrs_0.3.8 tzdb_0.1.2 cellranger_1.1.0
## [76] gtable_0.3.0 reshape_0.8.8 assertthat_0.2.1
## [79] xfun_0.24 mime_0.10 broom_0.7.7
## [82] ragg_1.1.3 pheatmap_1.0.12 ellipsis_0.3.2
Styled with knitrBootstrap